In this notebook, code is made available that:
Decision made to use cases/deaths data only from Mar 16, 2020 to May 30, 2021 — the longest period for which data are available for all three countries (the lowest common denominator).
import covid19czechia as CZ
import covid19sweden as SE
import numpy as np
import plotly.express as px
import pandas as pd
from datetime import datetime, date, timedelta
import dtw
import plotly.io as pio
pio.renderers.default='notebook'
Importing the dtw module. When using in academic works please cite: T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package. J. Stat. Soft., doi:10.18637/jss.v031.i07.
# read in data with regional attributes (name, population, area, etc.)
regions = pd.read_csv("./data/regions.csv")

## dissimilarity matrices based on population and population density
# extract region labels ('NUTS3 code - name')
# NOTE(review): pairing the two columns' .unique() outputs relies on both
# having identical length and ordering; if regions.csv has one row per NUTS3
# code, `regions['NUTS3'] + ' - ' + regions['name']` would be safer — confirm.
reg_names = regions['NUTS3'].unique() + ' - ' + regions['name'].unique()

def _scaled_abs_diff_matrix(value_by_code):
    """Pairwise |a - b| dissimilarity matrix over regions, scaled to max 1.

    value_by_code: dict mapping NUTS3 code -> numeric value.
    Returns a square DataFrame labelled by the combined 'code - name' strings.
    (Extracted to remove the duplicated matrix-building code that previously
    appeared once for population and once for population density.)
    """
    codes = regions['NUTS3'].unique()
    mat = pd.DataFrame([[abs(value_by_code[reg_row] - value_by_code[reg_col])
                         for reg_col in codes]
                        for reg_row in codes],
                       columns=reg_names).rename(dict(enumerate(reg_names)))
    # scale so the largest dissimilarity is exactly 1
    return mat / mat.max().max()

## set up population dissimilarity matrix
# column 0 = NUTS3 code, column 2 = population (positional — depends on CSV layout)
population = dict(zip(regions.iloc[:, 0], regions.iloc[:, 2]))
pop_diss = _scaled_abs_diff_matrix(population)

## set up population density dissimilarity matrix
# column 3 is presumably the region's area — TODO confirm against regions.csv
population_density = dict(zip(regions.iloc[:, 0], regions.iloc[:, 2] / regions.iloc[:, 3]))
pop_dens_diss = _scaled_abs_diff_matrix(population_density)

## combine population and density dissimilarities (Euclidean combination)
pop_pop_dens = np.sqrt(pop_diss**2 + pop_dens_diss**2)
pop_pop_dens.to_csv('./data/clustering_distance_datasets/pop_pop_dens.csv')
Cluster 1 (18 members):
SE214 - Gotland, SE322 - JƤmtland, SE221 - Blekinge, SE212 - Kronoberg, SE213 - Kalmar,
SE321 - VƤsternorrland, SE332 - Norrbotten, SE331 - VƤsterbotten, SE312 - Dalarna, SE311 - VƤrmland,
SE313 - GƤvleborg, SE124 - Ɩrebro, SE122 - Sƶdermanland, SE125 - VƤstmanland, CZ041 - Karlovy Vary,
SE231 - Halland, SE211 - Jƶnkƶping, SE121 - Uppsala
Cluster 2 (9 members):
SE123 - Ɩstergƶtland, CZ051 - Liberec, CZ063 - Vysočina, CZ053 - Pardubice, CZ052 - Hradec KrĆ”lovĆ©,
CZ032 - PlzeÅˆ, CZ031 - South Bohemian, CZ072 - ZlĆ­n, CZ071 - Olomouc
Cluster 3 (12 members):
CZ042 - ÚstĆ­ nad Labem, PL52 - Opolskie, PL43 - Lubuskie, PL84 - Podlaskie, CZ064 - South Moravian,
PL72 - Świętokrzyskie, CZ080 - Moravian-Silesian, CZ020 - Central Bohemian, SE224 - SkĆ„ne,
PL62 - WarmiÅ„sko-Mazurskie, PL42 - Zachodniopomorskie, SE232 - VƤstra Gƶtaland
Cluster 4 (6 members):
PL61 - Kujawsko-Pomorskie, PL82 - Podkarpackie, PL81 - Lubelskie, PL63 - Pomorskie,
PL71 - Łódzkie, SE110 - Stockholm
Cluster 5 (5 members):
PL51 - DolnoÅ›lÄ…skie, PL21 - Małopolskie, PL41 - Wielkopolskie, PL22 - ÅšlÄ…skie, PL9 - Mazowieckie
Cluster 6 (singleton):
CZ010 - Prague
# assign each region to its apriori cluster (cluster_1) via a lookup table
_cluster_members = {
    1: ['SE214', 'SE322', 'SE221', 'SE212', 'SE213', 'SE321',
        'SE332', 'SE331', 'SE312', 'SE311', 'SE313', 'SE124',
        'SE122', 'SE125', 'CZ041', 'SE231', 'SE211', 'SE121'],
    2: ['SE123', 'CZ051', 'CZ063', 'CZ053', 'CZ052', 'CZ032',
        'CZ031', 'CZ072', 'CZ071'],
    3: ['CZ042', 'PL52', 'PL43', 'PL84', 'CZ064', 'PL72',
        'CZ080', 'CZ020', 'SE224', 'PL62', 'PL42', 'SE232'],
    4: ['PL61', 'PL82', 'PL81', 'PL63', 'PL71', 'SE110'],
    5: ['PL51', 'PL21', 'PL41', 'PL22', 'PL9'],
    6: ['CZ010'],  # singleton: Prague
}
for cluster_id, members in _cluster_members.items():
    regions.loc[regions['NUTS3'].isin(members), 'cluster_1'] = cluster_id
# cluster_2 is a coarser clustering that merges clusters 1 and 2:
# shift every label down by one, then clamp the resulting 0s back to 1
regions['cluster_2'] = regions['cluster_1'] - 1
regions.loc[regions['cluster_2'] == 0, 'cluster_2'] = 1
# rename column 'NUTS3' as region
regions = regions.rename(columns = {'NUTS3':'region'})
# utility functions
### 1
def week_num_to_date(year, week_num):
    """Convert a (year, week) pair to the datetime of that week's Monday.

    Uses strptime's '%W' numbering (weeks start on Monday; week 01 is the
    first week containing a Monday). For 2020 the result is shifted back one
    week so the numbering lines up with the data sources' convention.

    NOTE(review): week numbers elsewhere in this notebook come from
    isocalendar(), i.e. ISO-8601 numbering, which can differ from '%W' by one
    around year boundaries — confirm the conventions agree for the weeks in
    scope (Mar 2020 – May 2021).

    FIX: previously years other than 2020/2021 silently returned None; now
    every year other than 2020 is parsed directly, which generalizes the
    function without changing its results for the years actually used.
    """
    monday = datetime.strptime('%d%02d-1' % (year, week_num), '%Y%W-%w')
    if year == 2020:
        return monday - timedelta(days=7)
    return monday
### 2
def daterange(start_date, end_date):
    """Yield dates from start_date onwards in 7-day steps.

    The number of dates yielded is floor((end_date - start_date).days / 7) + 1,
    so end_date itself is included only when it lies on an exact weekly
    boundary from start_date.
    """
    n_weeks = int((end_date - start_date).days / 7) + 1
    current = start_date
    for _ in range(n_weeks):
        yield current
        current = current + timedelta(7)
# get PL data
# Data come from an xlsx export of a Google sheet; the snapshot date is pinned
# (20210606) for reproducibility. The datetime.today() variant is kept
# commented out for refreshing against a newer snapshot.
#file_name = './data/pl_google_sheet/' + datetime.today().strftime('%Y%m%d') + '.xlsx'
file_name = './data/pl_google_sheet/' + '20210606' + '.xlsx'
### read in PL cases data
# skiprows/nrows select the 16-row daily-cases table on the sheet; these
# offsets depend on the sheet layout and will break if it is restructured
pl_cases = pd.read_excel(io = file_name,
sheet_name='Wzrost w województwach',
dtype=object,
engine='openpyxl',
skiprows=7,
nrows=16)
### convert wide form data to long form (for facilitating future merge)
# column 0 is the voivodeship name; the last two columns are presumably
# summary columns (hence the [1:-2] slice) — confirm against the sheet
pl_value_vars = list(pl_cases.columns)[1:-2]
pl_cases = pd.melt(frame = pl_cases.iloc[:, 0:-2],
id_vars='Województwo',
value_vars=pl_value_vars,
var_name='date',
value_name='cases')
### get year and week data based on date
pl_cases['year'] = pl_cases['date'].apply(lambda x: int(x.year))
# isocalendar()[1] is the ISO-8601 week number
pl_cases['week'] = pl_cases['date'].apply(lambda x: int(x.isocalendar()[1]))
# ISO week 53 observed in January belongs to the previous ISO year
pl_cases.loc[(pl_cases['week'] == 53) & (pl_cases['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
# collapse daily counts into weekly totals per voivodeship
pl_cases = pl_cases.groupby(['year', 'week', 'Województwo']).aggregate({'cases': 'sum'}).reset_index()
# re-derive a representative date (week start) for each (year, week) pair
pl_cases['date'] = pl_cases.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
### read in PL deaths data
# same sheet as the cases table; skiprows=50 jumps to the deaths table located
# further down the sheet (layout-dependent offset)
pl_deaths = pd.read_excel(io = file_name,
sheet_name='Wzrost w województwach',
dtype=object,
engine='openpyxl',
skiprows=50,
nrows=16)
### convert wide form data to long form (for facilitating future merge)
# reuses pl_value_vars from the cases table — assumes both tables share the
# same date columns (TODO confirm against the spreadsheet)
pl_deaths = pd.melt(frame = pl_deaths.iloc[:, 0:-2],
id_vars='Województwo',
value_vars=pl_value_vars,
var_name='date',
value_name='deaths')
### get year and week data based on date
pl_deaths['year'] = pl_deaths['date'].apply(lambda x: int(x.year))
# ISO-8601 week number
pl_deaths['week'] = pl_deaths['date'].apply(lambda x: int(x.isocalendar()[1]))
# ISO week 53 observed in January belongs to the previous ISO year
pl_deaths.loc[(pl_deaths['week'] == 53) & (pl_deaths['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
# collapse daily counts into weekly totals per voivodeship
pl_deaths = pl_deaths.groupby(['year', 'week', 'Województwo']).aggregate({'deaths': 'sum'}).reset_index()
pl_deaths['date'] = pl_deaths.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
### read in 'new' PL tests data (tests after 28.12.2020)
pl_tests_new = pd.read_excel(io = file_name,
sheet_name='Testy w województwach',
dtype=object,
engine='openpyxl',
skiprows=3,
nrows=16)
### convert wide form data to long form (for facilitating future merge)
# NOTE(review): the melted frame is pl_tests_new.iloc[:, 1:-2] but value_vars
# come from the UNsliced columns[2:-2] — this assumes column 0 is an extra
# leading column and column 1 is 'Województwo'; confirm against the sheet
pl_tests_new = pd.melt(frame = pl_tests_new.iloc[:, 1:-2],
id_vars='Województwo',
value_vars=list(pl_tests_new.columns)[2:-2],
var_name='date',
value_name='tests')
### filter out old data
# the pre-28.12.2020 period is covered by the 'old' sheet read below
pl_tests_new = pl_tests_new[pl_tests_new['date'] >= '2020-12-28'] # only pick up data from 28 Dec 2020 or later
### get year and week data based on date
pl_tests_new['year'] = pl_tests_new['date'].apply(lambda x: int(x.year))
# ISO-8601 week number
pl_tests_new['week'] = pl_tests_new['date'].apply(lambda x: int(x.isocalendar()[1]))
# ISO week 53 observed in January belongs to the previous ISO year
pl_tests_new.loc[(pl_tests_new['week'] == 53) & (pl_tests_new['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
# collapse daily counts into weekly totals per voivodeship
pl_tests_new = pl_tests_new.groupby(['year', 'week', 'Województwo']).aggregate({'tests': 'sum'}).reset_index()
pl_tests_new['date'] = pl_tests_new.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
### read in 'old' PL tests data (tests up to 28.12.2020)
# this sheet reports CUMULATIVE test counts at (roughly) weekly dates
pl_wkly_tests_old = pd.read_excel(io = file_name,
sheet_name=' Testy w województwach od 11.05', # sheet name appears as 'Testy w województwach od 11.05 do 28.12.2020' in some files
dtype=object,
engine='openpyxl',
skiprows=2,
nrows=16,
verbose=False,
parse_dates=False,
date_parser=None)
# normalise the region label so it matches the other tables
pl_wkly_tests_old.iloc[12, 0] = 'ÅwiÄtokrzyskie' # source spreadsheet reports this region as 'ÅwiÄtokrzyskie*'
### convert cumulative to weekly counts
# row-wise first difference across the date columns; the trailing 5 columns
# are presumably summary columns and are excluded — confirm against the sheet
pl_wkly_tests_old.iloc[:, 1:-5] = pl_wkly_tests_old.iloc[:, 1:-5].diff(axis = 1)
### filter out first week tests info as that appears to be a cumulative figure which could not be corrected
### due to lack of data before that week
pl_wkly_tests_old.drop(columns = pl_wkly_tests_old.columns[1], inplace = True, errors = 'raise')
### convert wide form data to long form (for facilitating future merge)
pl_wkly_tests_old = pd.melt(frame = pl_wkly_tests_old.iloc[:, 0:-5],
id_vars='Województwo',
value_vars=list(pl_wkly_tests_old.columns)[1:-5],
var_name='date',
value_name='tests')
### fix stats mistake in ÅwiÄtokrzyskie
# early figures for this region are unreliable in the source; blank them out
pl_wkly_tests_old.loc[
(pl_wkly_tests_old['Województwo'] == 'ÅwiÄtokrzyskie') &
(pl_wkly_tests_old.date <= datetime(2020,8,10)), 'tests'] = np.nan
### get year and week data based on date
pl_wkly_tests_old['year'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.year))
# ISO-8601 week number
pl_wkly_tests_old['week'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.isocalendar()[1]))
### filter out new data
# the 28.12.2020-onwards period is covered by the 'new' sheet read above
pl_wkly_tests_old = pl_wkly_tests_old[pl_wkly_tests_old['date'] < '2020-12-28'] # only pick up data from before 28 Dec 2020
# aggregate PL data
# merge weekly cases, deaths and tests (new + old test sources stacked) on
# voivodeship + week; left joins keep case/death weeks even where tests
# coverage is missing
pl_data = pd.merge(pd.merge(pl_cases, pl_deaths, on = ['Województwo', 'date', 'year', 'week'], how = 'left'),
pd.concat([pl_tests_new, pl_wkly_tests_old]), on = ['Województwo', 'date', 'year', 'week'], how = 'left') \
.rename({'Województwo':'name'}, axis = 1)
# aggregate SE data
# covid19sweden's covid_deaths() frame carries both deaths and confirmed
# cases; no tests data are available for Sweden
se_data = SE.covid_deaths()
se_data = se_data.groupby(['year', 'week', 'region']) \
.aggregate({'deaths':'sum', 'confirmed':'sum'}).reset_index()
se_data['date'] = se_data.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
# align the column name with the PL/CZ frames
se_data = se_data.rename({'confirmed':'cases'}, axis = 1)
### read in CZ cases data
# level = 2 requests regional (NUTS3) granularity from covid19czechia
cz_cases = CZ.covid_confirmed(level = 2)
cz_cases['year'] = cz_cases['date'].apply(lambda x: x.year)
# the 'week' column is presumably supplied by the package (ISO numbering) —
# week 53 observed in January belongs to the previous year
cz_cases.loc[(cz_cases['week'] == 53) & (cz_cases['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
# collapse daily counts into weekly totals per region
cz_cases = cz_cases.groupby(['year', 'week', 'region']).aggregate({'confirmed': 'sum'}).reset_index()
cz_cases['date'] = cz_cases.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
# align the column name with the PL/SE frames
cz_cases = cz_cases.rename({'confirmed':'cases'}, axis = 1)
/Users/stray/miniconda3/lib/python3.7/site-packages/covid19czechia/mzcr.py:129: DtypeWarning: Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.
### read in CZ deaths data
cz_deaths = CZ.covid_deaths(level = 2)
cz_deaths['year'] = cz_deaths['date'].apply(lambda x: x.year)
# week 53 observed in January belongs to the previous year
cz_deaths.loc[(cz_deaths['week'] == 53) & (cz_deaths['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
cz_deaths = cz_deaths.groupby(['year', 'week', 'region']).aggregate({'deaths': 'sum'}).reset_index()
cz_deaths['date'] = cz_deaths.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
### read in CZ tests data
cz_tests = CZ.covid_tests(level = 2)
cz_tests['year'] = cz_tests['date'].apply(lambda x: x.year)
# week 53 observed in January belongs to the previous year
cz_tests.loc[(cz_tests['week'] == 53) & (cz_tests['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
cz_tests = cz_tests.groupby(['year', 'week', 'region']).aggregate({'tests': 'sum'}).reset_index()
cz_tests['date'] = cz_tests.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
# aggregate CZ data
# left joins keep case weeks even where deaths/tests are missing
cz_data = pd.merge(pd.merge(cz_cases, cz_deaths, on = ['region', 'date', 'year', 'week'], how = 'left'),
cz_tests, on = ['region', 'date', 'year', 'week'], how = 'left')
# cross reference the PL/CZ/SE datasets with info in the regions file
# PL is joined on the human-readable name (the PL spreadsheet carries no NUTS
# codes), while CZ and SE are joined on the NUTS3 code; inner joins drop any
# region not present in regions.csv
pl_data = pd.merge(regions[['region', 'name', 'population', 'cluster_1', 'cluster_2']], pl_data,
on='name', how='inner')
cz_data = pd.merge(regions[['region', 'name', 'population', 'cluster_1', 'cluster_2']], cz_data,
on='region', how='inner')
se_data = pd.merge(regions[['region', 'name', 'population', 'cluster_1', 'cluster_2']], se_data,
on='region', how='inner')
# concatenate all the three datasets
# FIX: this concat / relabel / astype sequence previously appeared twice in a
# row (a pasted duplicate); a single pass is sufficient and equivalent, since
# the second pass rebuilt all_data from scratch anyway.
all_data = pd.concat([pl_data, cz_data, se_data])
# concatenate NUTS code and region name into one human-readable label
all_data['region'] = all_data['region'] + ' - ' + all_data['name']
# fix datatype of 'tests' column (handle NaN)
all_data['tests'] = all_data['tests'].astype('float')
# derive per-capita and per-test rates for every region-week
all_data = all_data.assign(
    cases_100K=lambda d: d['cases'] / d['population'] * 100000,
    deaths_100K=lambda d: d['deaths'] / d['population'] * 100000,
    tests_100K=lambda d: d['tests'] / d['population'] * 100000,
    cases_per_test=lambda d: d['cases'] / d['tests'],
    deaths_per_test=lambda d: d['deaths'] / d['tests'],
    deaths_per_case=lambda d: d['deaths'] / d['cases'],
)
# restrict to the mature common window (Mar 16, 2020 – May 30, 2021);
# adjust the upper bound when data updates are done
all_data = all_data[(all_data['date'] >= '2020-03-16') &
                    (all_data['date'] <= '2021-05-30')]
# impute missing tests/cases/deaths
regions_series = all_data['region'].unique()
start_date = all_data['date'].min() # by construction, min reporting date of cases/deaths, viz. Mar 16, 2020
end_date = all_data['date'].max() # by construction, max reporting date of cases/deaths
cz_tests_min_date = cz_tests['date'].min()
cz_tests_max_date = cz_tests['date'].max()
# hoisted: the PL tests frames were previously concatenated twice just to read min/max
_pl_tests_all = pd.concat([pl_tests_new, pl_wkly_tests_old])
pl_tests_min_date = _pl_tests_all['date'].min()
pl_tests_max_date = _pl_tests_all['date'].max()

def _tests_fill_value(reg, dt):
    """Fill value for the tests-derived columns of (reg, dt).

    Inside a country's tests-reporting window a missing value is read as
    'no tests reported' and imputed as 0; outside the window (and for SE,
    which reports no tests at all) it stays NaN.
    """
    in_pl_window = reg.startswith('PL') and pl_tests_min_date <= dt <= pl_tests_max_date
    in_cz_window = reg.startswith('CZ') and cz_tests_min_date <= dt <= cz_tests_max_date
    return 0 if (in_pl_window or in_cz_window) else np.nan

_new_records = []
for dt in daterange(start_date, end_date):
    for reg in regions_series:
        mask = (all_data['date'] == dt) & (all_data['region'] == reg)
        record = all_data[mask]
        if record.shape[0] == 0:  # row does not exist - so create it
            year = dt.year
            week = dt.isocalendar()[1]
            # BUG FIX: was 'if week == 53 & dt.month == 1' — bitwise '&' binds
            # tighter than '==', so the condition actually fired for week 1 in
            # January and never for week 53, mislabelling imputed rows' years.
            if week == 53 and dt.month == 1:
                year = year - 1
            tests = _tests_fill_value(reg, dt)
            # look up the cluster labels by NUTS3 code (positions 5/6 are the
            # cluster_1/cluster_2 columns — depends on regions.csv layout)
            region_rec = regions[regions['region'] == reg.split(' - ')[0]]
            _new_records.append({'region': reg,
                                 'year': year,
                                 'cluster_1': region_rec.iloc[0, 5],
                                 'cluster_2': region_rec.iloc[0, 6],
                                 'week': week,
                                 'cases': 0,
                                 'date': dt,
                                 'deaths': 0,
                                 'tests': tests,
                                 'cases_100K': 0,
                                 'deaths_100K': 0,
                                 'tests_100K': tests,
                                 'cases_per_test': tests,
                                 'deaths_per_test': tests,
                                 'deaths_per_case': 0})
        else:
            if pd.isnull(record['cases']).item():
                all_data.loc[mask, ['cases', 'cases_100K', 'cases_per_test']] = 0
            if pd.isnull(record['deaths']).item():
                # assumption: there will never be deaths if there are no cases
                all_data.loc[mask, ['deaths', 'deaths_100K',
                                    'deaths_per_test', 'deaths_per_case']] = 0
            if pd.isnull(record['tests']).item():
                # assumption: there will never be cases/deaths if there are no tests
                if _tests_fill_value(reg, dt) == 0:
                    all_data.loc[mask, ['tests', 'tests_100K',
                                        'cases_per_test', 'deaths_per_test']] = 0
# FIX: append all created rows in one concat — DataFrame.append was removed in
# pandas 2.0 and growing the frame row-by-row inside the loop was quadratic.
# (Row order may differ from the old incremental appends, but the very next
# cell sorts all_data by date/region anyway.)
if _new_records:
    all_data = pd.concat([all_data, pd.DataFrame(_new_records)], ignore_index=True)
# sort data before plot
all_data.sort_values(by = ['date', 'region'], axis=0, ascending=True, inplace=True)
# visualise trends in all regions
# (the notnull filters keep plotly from drawing lines through missing weeks)
# 1
fig = px.line(all_data[pd.notnull(all_data['cases'])], x='date', y='cases', color='region',
title='Weekly Cases by Region')
fig.show()
# 2
fig = px.line(all_data[pd.notnull(all_data['cases_100K'])], x='date', y='cases_100K', color='region',
title='Weekly Cases per 100K Capita by Region')
fig.show()
# 3
fig = px.line(all_data[pd.notnull(all_data['deaths'])], x='date', y='deaths', color='region',
title='Weekly COVID19 Deaths by Region')
fig.show()
# 4
fig = px.line(all_data[pd.notnull(all_data['deaths_100K'])], x='date', y='deaths_100K', color='region',
title='Weekly COVID19 Deaths per 100K Capita by Region')
fig.show()
# 5
fig = px.line(all_data[pd.notnull(all_data['tests'])], x='date', y='tests', color='region',
title='Weekly Tests by Region')
fig.show()
# 6
# BUG FIX: previously plotted y='tests' although the title promises per-100K values
fig = px.line(all_data[pd.notnull(all_data['tests_100K'])], x='date', y='tests_100K', color='region',
title='Weekly Tests per 100K Capita by Region')
fig.show()
# 7
fig = px.line(all_data[(pd.notnull(all_data['cases_per_test'])) &
(pd.notnull(all_data['cases'])) &
(pd.notnull(all_data['tests']))],
x='date', y='cases_per_test', color='region',
title='Weekly Cases per Test by Region')
fig.show()
# 8
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_test'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['tests']))],
x='date', y='deaths_per_test', color='region',
title='Weekly COVID19 Deaths per Test by Region')
fig.show()
# 9
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_case'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['cases']))],
x='date', y='deaths_per_case', color='region',
title='Weekly COVID19 Deaths per Case by Region')
fig.show()
# prepare data for cluster wise graphs (apriori clustering no. 1)
cluster1_data = all_data.groupby(['cluster_1','date']).aggregate({'cases': 'sum',
'deaths': 'sum',
'tests': 'sum',
'population':'sum'}).reset_index()
# BUG FIX: the derived rates previously divided by columns of `all_data`
# (index-aligned against the much longer region-level frame) instead of the
# aggregated `cluster1_data` itself, producing misaligned/NaN-padded values.
cluster1_data['cases_100K'] = cluster1_data['cases']/cluster1_data['population']*100000
cluster1_data['deaths_100K'] = cluster1_data['deaths']/cluster1_data['population']*100000
cluster1_data['tests_100K'] = cluster1_data['tests']/cluster1_data['population']*100000
cluster1_data['cases_per_test'] = cluster1_data['cases']/cluster1_data['tests']
cluster1_data['deaths_per_test'] = cluster1_data['deaths']/cluster1_data['tests']
cluster1_data['deaths_per_case'] = cluster1_data['deaths']/cluster1_data['cases']
cluster1_data.sort_values(by = ['date', 'cluster_1'], axis=0, ascending=True, inplace=True)
# prepare data for cluster wise graphs (apriori clustering no. 2)
cluster2_data = all_data.groupby(['cluster_2','date']).aggregate({'cases': 'sum',
'deaths': 'sum',
'tests': 'sum',
'population':'sum'}).reset_index()
# BUG FIX: as with cluster1_data, the rates previously divided by `all_data`
# columns instead of the aggregated frame, producing misaligned values.
cluster2_data['cases_100K'] = cluster2_data['cases']/cluster2_data['population']*100000
cluster2_data['deaths_100K'] = cluster2_data['deaths']/cluster2_data['population']*100000
cluster2_data['tests_100K'] = cluster2_data['tests']/cluster2_data['population']*100000
cluster2_data['cases_per_test'] = cluster2_data['cases']/cluster2_data['tests']
cluster2_data['deaths_per_test'] = cluster2_data['deaths']/cluster2_data['tests']
cluster2_data['deaths_per_case'] = cluster2_data['deaths']/cluster2_data['cases']
cluster2_data.sort_values(by = ['date', 'cluster_2'], axis=0, ascending=True, inplace=True)
# visualise trends in all apriori clusters: cluster 1
# 1
fig = px.line(cluster1_data[pd.notnull(cluster1_data['cases'])], x='date', y='cases', color='cluster_1',
title='Weekly Cases by Apriori Cluster')
fig.show()
# 2
fig = px.line(cluster1_data[pd.notnull(cluster1_data['cases_100K'])], x='date', y='cases_100K',
color='cluster_1', title='Weekly Cases per 100K Capita by Apriori Cluster')
fig.show()
# 3
fig = px.line(cluster1_data[pd.notnull(cluster1_data['deaths'])], x='date', y='deaths', color='cluster_1',
title='Weekly COVID19 Deaths by Apriori Cluster')
fig.show()
# 4
fig = px.line(cluster1_data[pd.notnull(cluster1_data['deaths_100K'])],
x='date', y='deaths_100K', color='cluster_1',
title='Weekly COVID19 Deaths per 100K Capita by Apriori Cluster')
fig.show()
# 5
fig = px.line(cluster1_data[pd.notnull(cluster1_data['tests'])], x='date', y='tests', color='cluster_1',
title='Weekly Tests by Apriori Cluster')
fig.show()
# 6
# BUG FIX: previously plotted y='tests' although the title promises per-100K values
fig = px.line(cluster1_data[pd.notnull(cluster1_data['tests_100K'])], x='date', y='tests_100K', color='cluster_1',
title='Weekly Tests per 100K Capita by Apriori Cluster')
fig.show()
# 7
fig = px.line(cluster1_data[(pd.notnull(cluster1_data['cases_per_test'])) &
(pd.notnull(cluster1_data['cases'])) &
(pd.notnull(cluster1_data['tests']))],
x='date', y='cases_per_test', color='cluster_1',
title='Weekly Cases per Test by Apriori Cluster')
fig.show()
# 8
fig = px.line(cluster1_data[(pd.notnull(cluster1_data['deaths_per_test'])) &
(pd.notnull(cluster1_data['deaths'])) &
(pd.notnull(cluster1_data['tests']))],
x='date', y='deaths_per_test', color='cluster_1',
title='Weekly COVID19 Deaths per Test by Apriori Cluster')
fig.show()
# 9
fig = px.line(cluster1_data[(pd.notnull(cluster1_data['deaths_per_case'])) &
(pd.notnull(cluster1_data['deaths'])) &
(pd.notnull(cluster1_data['cases']))],
x='date', y='deaths_per_case', color='cluster_1',
title='Weekly COVID19 Deaths per Case by Apriori Cluster')
fig.show()
# visualise trends in all apriori clusters: cluster 2
# 1
fig = px.line(cluster2_data[pd.notnull(cluster2_data['cases'])], x='date', y='cases', color='cluster_2',
title='Weekly Cases by Apriori Cluster')
fig.show()
# 2
fig = px.line(cluster2_data[pd.notnull(cluster2_data['cases_100K'])],
x='date', y='cases_100K', color='cluster_2',
title='Weekly Cases per 100K Capita by Apriori Cluster')
fig.show()
# 3
fig = px.line(cluster2_data[pd.notnull(cluster2_data['deaths'])], x='date', y='deaths', color='cluster_2',
title='Weekly COVID19 Deaths by Apriori Cluster')
fig.show()
# 4
fig = px.line(cluster2_data[pd.notnull(cluster2_data['deaths_100K'])],
x='date', y='deaths_100K', color='cluster_2',
title='Weekly COVID19 Deaths per 100K Capita by Apriori Cluster')
fig.show()
# 5
fig = px.line(cluster2_data[pd.notnull(cluster2_data['tests'])], x='date', y='tests', color='cluster_2',
title='Weekly Tests by Apriori Cluster')
fig.show()
# 6
# BUG FIX: previously plotted y='tests' although the title promises per-100K values
fig = px.line(cluster2_data[pd.notnull(cluster2_data['tests_100K'])], x='date', y='tests_100K', color='cluster_2',
title='Weekly Tests per 100K Capita by Apriori Cluster')
fig.show()
# 7
fig = px.line(cluster2_data[(pd.notnull(cluster2_data['cases_per_test'])) &
(pd.notnull(cluster2_data['cases'])) &
(pd.notnull(cluster2_data['tests']))],
x='date', y='cases_per_test', color='cluster_2',
title='Weekly Cases per Test by Apriori Cluster')
fig.show()
# 8
fig = px.line(cluster2_data[(pd.notnull(cluster2_data['deaths_per_test'])) &
(pd.notnull(cluster2_data['deaths'])) &
(pd.notnull(cluster2_data['tests']))],
x='date', y='deaths_per_test', color='cluster_2',
title='Weekly COVID19 Deaths per Test by Apriori Cluster')
fig.show()
# 9
fig = px.line(cluster2_data[(pd.notnull(cluster2_data['deaths_per_case'])) &
(pd.notnull(cluster2_data['deaths'])) &
(pd.notnull(cluster2_data['cases']))],
x='date', y='deaths_per_case', color='cluster_2',
title='Weekly COVID19 Deaths per Case by Apriori Cluster')
fig.show()
Six clusters of regions are created below:
N.B. Clusters based on tests (incl. those based on cases or deaths per test) have not been constructed because tests data are not available for Sweden
# obtain distance matrices for clustering
# dm1/dm2/dm3 hold, for each pair of regions, the maximum absolute date-wise
# difference (a Chebyshev distance over the weekly series) of cases_100K,
# deaths_100K and deaths_per_case respectively; dm1_dtw/dm2_dtw/dm3_dtw hold
# the corresponding DTW normalised distances.
### initialise
dm1 = np.zeros([len(regions_series), len(regions_series)])
dm2 = np.zeros([len(regions_series), len(regions_series)])
dm3 = np.zeros([len(regions_series), len(regions_series)])
dm1_dtw = np.zeros([len(regions_series), len(regions_series)])
dm2_dtw = np.zeros([len(regions_series), len(regions_series)])
dm3_dtw = np.zeros([len(regions_series), len(regions_series)])
### compute and store distances
# the matrices are symmetric, so only the upper triangle is computed and
# mirrored; the diagonal stays 0
for reg_id, reg in enumerate(regions_series):
# one-row frames (dates as columns) for the first region of each pair
time_series_1_dm1 = all_data[(all_data['region'] == reg) & pd.notnull(all_data['cases_100K'])] \
[['cases_100K', 'date']].sort_values(by = 'date').set_index('date').transpose()
time_series_1_dm2 = all_data[(all_data['region'] == reg) & pd.notnull(all_data['deaths_100K'])] \
[['deaths_100K', 'date']].sort_values(by = 'date').set_index('date').transpose()
time_series_1_dm3 = all_data[(all_data['region'] == reg) & pd.notnull(all_data['deaths_per_case'])] \
[['deaths_per_case', 'date']].sort_values(by = 'date').set_index('date').transpose().fillna(0)
### NOTE: transpose being done to facilitate date-wise difference calculations
for reg_id_2 in range(reg_id + 1, len(regions_series)):
# NOTE(review): unlike the first series, the second is NOT filtered for
# notnull — for dm1/dm2/dm3 the nulls get dropped after differencing below,
# but the DTW calls see the unfiltered frames; confirm this is intended
time_series_2_dm1 = \
all_data[all_data['region'] == regions_series[reg_id_2]][['cases_100K', 'date']] \
.sort_values(by = 'date').set_index('date').transpose()
time_series_2_dm2 = \
all_data[all_data['region'] == regions_series[reg_id_2]][['deaths_100K', 'date']]\
.sort_values(by = 'date').set_index('date').transpose()
time_series_2_dm3 = \
all_data[all_data['region'] == regions_series[reg_id_2]][['deaths_per_case', 'date']]\
.sort_values(by = 'date').set_index('date').transpose().fillna(0)
# subtraction aligns on the date columns; dates present in only one of the
# two series produce NaN and are filtered out just below
diff_1 = abs(time_series_1_dm1 - time_series_2_dm1).transpose()
diff_1 = diff_1[pd.notnull(diff_1['cases_100K'])]
diff_2 = abs(time_series_1_dm2 - time_series_2_dm2).transpose()
diff_2 = diff_2[pd.notnull(diff_2['deaths_100K'])]
diff_3 = abs(time_series_1_dm3 - time_series_2_dm3).transpose()
diff_3 = diff_3[pd.notnull(diff_3['deaths_per_case'])]
### NOTE: transpose being done here (again) to exclude null values, which are
### generated if one time-series has no value for a given date, but the other does
dm1[reg_id, reg_id_2] = diff_1.max().item()
dm1[reg_id_2, reg_id] = dm1[reg_id, reg_id_2]
dm2[reg_id, reg_id_2] = diff_2.max().item()
dm2[reg_id_2, reg_id] = dm2[reg_id, reg_id_2]
dm3[reg_id, reg_id_2] = diff_3.max().item()
dm3[reg_id_2, reg_id] = dm3[reg_id, reg_id_2]
# normalizedDistance is the DTW distance normalised for path length
# (per the dtw package's step-pattern normalisation)
dm1_dtw[reg_id, reg_id_2] = dtw.dtw(time_series_1_dm1, time_series_2_dm1).normalizedDistance
dm1_dtw[reg_id_2, reg_id] = dm1_dtw[reg_id, reg_id_2]
dm2_dtw[reg_id, reg_id_2] = dtw.dtw(time_series_1_dm2, time_series_2_dm2).normalizedDistance
dm2_dtw[reg_id_2, reg_id] = dm2_dtw[reg_id, reg_id_2]
dm3_dtw[reg_id, reg_id_2] = dtw.dtw(time_series_1_dm3, time_series_2_dm3).normalizedDistance
dm3_dtw[reg_id_2, reg_id] = dm3_dtw[reg_id, reg_id_2]
# export the six distance matrices to csv, labelled by region
_distance_outputs = {
    'ts_cases_100k_wo_dtw.csv': dm1,
    'ts_deaths_100k_wo_dtw.csv': dm2,
    'ts_deaths_per_case_wo_dtw.csv': dm3,
    'ts_cases_100k_w_dtw.csv': dm1_dtw,
    'ts_deaths_100k_w_dtw.csv': dm2_dtw,
    'ts_deaths_per_case_w_dtw.csv': dm3_dtw,
}
for fname, matrix in _distance_outputs.items():
    labelled = pd.DataFrame(matrix, columns=regions_series) \
        .rename(dict(enumerate(regions_series)))
    labelled.to_csv('./data/clustering_distance_datasets/' + fname)
# visualise trends in all PL regions
# 1
fig = px.line(all_data[(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='cases', color='region', title='Weekly Cases by Region - Poland')
fig.show()
# 2
fig = px.line(all_data[(pd.notnull(all_data['cases_100K'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='cases_100K', color='region', title='Weekly Cases per 100K Capita by Region - Poland')
fig.show()
# 3
fig = px.line(all_data[(pd.notnull(all_data['deaths'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='deaths', color='region', title='Weekly COVID19 Deaths by Region - Poland')
fig.show()
# 4
fig = px.line(all_data[(pd.notnull(all_data['deaths_100K'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='deaths_100K', color='region',
title='Weekly COVID19 Deaths per 100K Capita by Region - Poland')
fig.show()
# 5
fig = px.line(all_data[(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='tests', color='region', title='Weekly Tests by Region - Poland')
fig.show()
# 6
# BUG FIX: previously plotted y='tests' although the title promises per-100K values
fig = px.line(all_data[(pd.notnull(all_data['tests_100K'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='tests_100K', color='region', title='Weekly Tests per 100K Capita by Region - Poland')
fig.show()
# 7
fig = px.line(all_data[(pd.notnull(all_data['cases_per_test'])) &
(pd.notnull(all_data['cases'])) &
(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='cases_per_test', color='region',
title='Weekly Cases per Test by Region - Poland')
fig.show()
# 8
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_test'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='deaths_per_test', color='region',
title='Weekly COVID19 Deaths per Test by Region - Poland')
fig.show()
# 9
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_case'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('PL'))],
x='date', y='deaths_per_case', color='region',
title='Weekly COVID19 Deaths per Case by Region - Poland')
fig.show()
# visualise trends in all CZ regions
# 1
fig = px.line(all_data[(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='cases', color='region', title='Weekly Cases by Region - Czechia')
fig.show()
# 2
fig = px.line(all_data[(pd.notnull(all_data['cases_100K'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='cases_100K', color='region', title='Weekly Cases per 100K Capita by Region - Czechia')
fig.show()
# 3
fig = px.line(all_data[(pd.notnull(all_data['deaths'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='deaths', color='region', title='Weekly COVID19 Deaths by Region - Czechia')
fig.show()
# 4
fig = px.line(all_data[(pd.notnull(all_data['deaths_100K'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='deaths_100K', color='region',
title='Weekly COVID19 Deaths per 100K Capita by Region - Czechia')
fig.show()
# 5
fig = px.line(all_data[(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='tests', color='region', title='Weekly Tests by Region - Czechia')
fig.show()
# 6
# BUG FIX: previously plotted y='tests' although the title promises per-100K values
fig = px.line(all_data[(pd.notnull(all_data['tests_100K'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='tests_100K', color='region', title='Weekly Tests per 100K Capita by Region - Czechia')
fig.show()
# 7
fig = px.line(all_data[(pd.notnull(all_data['cases_per_test'])) &
(pd.notnull(all_data['cases'])) &
(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='cases_per_test', color='region',
title='Weekly Cases per Test by Region - Czechia')
fig.show()
# 8
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_test'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['tests'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='deaths_per_test', color='region',
title='Weekly COVID19 Deaths per Test by Region - Czechia')
fig.show()
# 9
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_case'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('CZ'))],
x='date', y='deaths_per_case', color='region',
title='Weekly COVID19 Deaths per Case by Region - Czechia')
fig.show()
# visualise trends in all SE regions
# only cases/deaths plots are produced for Sweden — the tests-based plots
# (nos. 5-8 in the other sections) are omitted because SE reports no tests data
# 1
fig = px.line(all_data[(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('SE'))],
x='date', y='cases', color='region', title='Weekly Cases by Region - Sweden')
fig.show()
# 2
fig = px.line(all_data[(pd.notnull(all_data['cases_100K'])) & (all_data['region'].str.startswith('SE'))],
x='date', y='cases_100K', color='region', title='Weekly Cases per 100K Capita by Region - Sweden')
fig.show()
# 3
fig = px.line(all_data[(pd.notnull(all_data['deaths'])) & (all_data['region'].str.startswith('SE'))],
x='date', y='deaths', color='region', title='Weekly COVID19 Deaths by Region - Sweden')
fig.show()
# 4
fig = px.line(all_data[(pd.notnull(all_data['deaths_100K'])) & (all_data['region'].str.startswith('SE'))],
x='date', y='deaths_100K', color='region',
title='Weekly COVID19 Deaths per 100K Capita by Region - Sweden')
fig.show()
# 9 (numbering kept consistent with the PL/CZ sections)
fig = px.line(all_data[(pd.notnull(all_data['deaths_per_case'])) &
(pd.notnull(all_data['deaths'])) &
(pd.notnull(all_data['cases'])) & (all_data['region'].str.startswith('SE'))],
x='date', y='deaths_per_case', color='region',
title='Weekly COVID19 Deaths per Case by Region - Sweden')
fig.show()
#import eurostat_deaths
#total_deaths = eurostat_deaths.deaths(regions = ['PL', 'CZ', 'SE'], start = datetime(2015, 1, 1),
# output = 'total_deaths.csv')